############
#
# Load data
#
############

# Member attribute table, one row per record in WCAttrib.csv.
dv <- read.csv("WCAttrib.csv")

# Keep only the male members, because the edge lists used below are
# male-only. Rows whose gender is missing are dropped as well.
dv <- dv[which(dv$gender == "Male"), , drop = FALSE]

# Since we are also interested in predicting recidivism, build an indicator:
# start everyone at 0 and switch to 1 where recidflag1 is positive.
dv$recidivist <- numeric(nrow(dv))
dv$recidivist[which(dv$recidflag1 > 0)] <- 1
# NOTE(review): this reads `dv$recidflag`, not `recidflag1`. The intent stated
# for this variable is "recidivated at any point (regardless of if 1 or 2)",
# so a second flag column was presumably meant to be combined here -- confirm
# the column names; as written this line can only reset values to 0.
dv$recidivist[which(dv$recidflag == 0)] <- 0
table(dv$recidivist)

################
#
# Load Networks
# 
################

# Affirmation events ("ledge"): read the raw table and label its four columns.
ledge <- read.table("WCM-ledge")
names(ledge) <- c("eventDate", "sender", "receiver", "countEvents")
# Quick size checks: distinct dates, senders and receivers.
length(unique(ledge[["eventDate"]])) # [1] 2043
length(unique(ledge[["sender"]])) # [1] 1661
length(unique(ledge[["receiver"]])) # [1] 1716

# Correction events ("sedge"): same layout as the affirmation table.
sedge <- read.table("WCM-sedge")
names(sedge) <- c("eventDate", "sender", "receiver", "countEvents")
length(unique(sedge[["eventDate"]])) # [1] 2401
length(unique(sedge[["sender"]])) # [1] 1658
length(unique(sedge[["receiver"]])) # [1] 1710

################
#
# Clean data
# 
################

# Parse the month/day/year entry and exit dates into Date objects.
dv$enterClean <- as.Date(as.character(dv$enter), format = "%m/%d/%Y")
dv$exitClean <- as.Date(as.character(dv$exit), format = "%m/%d/%Y")

library(stringr)

# Slice the printed "YYYY-MM-DD" form into its year, month and day pieces.
yearEnter <- str_sub(dv$enterClean, 1, 4)
monthEnter <- str_sub(dv$enterClean, -5, -4)
dayEnter <- str_sub(dv$enterClean, -2)

yearExit <- str_sub(dv$exitClean, 1, 4)
monthExit <- str_sub(dv$exitClean, -5, -4)
dayExit <- str_sub(dv$exitClean, -2)

# Re-assemble as compact "YYYYMMDD" strings, which sort chronologically.
# A date that failed to parse ends up as the string "NANANA" here.
enterClean <- paste0(yearEnter, monthEnter, dayEnter)
exitClean <- paste0(yearExit, monthExit, dayExit)
dv$enterClean <- enterClean
dv$exitClean <- exitClean
head(dv)

# Remember each record's row name as an ordinary column.
dv$row <- rownames(dv)

# Clean network dates: convert the month/day/year event dates of both edge
# lists to the same compact "YYYYMMDD" string form.
sedge$eventDateClean <- as.Date(as.character(sedge$eventDate), format = "%m/%d/%Y")

library(stringr)

# Corrections: slice year / month / day out of the printed "YYYY-MM-DD" date.
year <- str_sub(sedge$eventDateClean, 1, 4)
month <- str_sub(sedge$eventDateClean, -5, -4)
day <- str_sub(sedge$eventDateClean, -2)

sedge$eventDateClean <- paste0(year, month, day)

# Affirmations: identical treatment.
ledge$eventDateClean <- as.Date(as.character(ledge$eventDate), format = "%m/%d/%Y")

year <- str_sub(ledge$eventDateClean, 1, 4)
month <- str_sub(ledge$eventDateClean, -5, -4)
day <- str_sub(ledge$eventDateClean, -2)

ledge$eventDateClean <- paste0(year, month, day)

# To make the wcid numbers more workable, keep characters 1-2, 4-5 and 7-9 of
# each id and glue them together, i.e. drop the characters at positions 3 and 6.
# (presumably those positions hold separators -- verify against the data)

nwcid <- as.character(dv$wcid)
wcida <- str_sub(nwcid, 1, 2)
wcidb <- str_sub(nwcid, 4, 5)
wcidc <- str_sub(nwcid, 7, 9)
nwcid <- paste0(wcida, wcidb, wcidc)
dv$wcid_new <- nwcid


# Now, I need to check and see which observations are duplicated (as per Philip's suggestion).
# A wcid that occurs more than once in dv is treated as a member who left and
# later came back. Every stay gets its own id "<wcid>.<k>" (k = 1, 2, ... in
# order of entry date), both in the attribute rows (dv_repeat) and in the
# edge lists (sedge_repeat / ledge_repeat).
duplicatedObs <- dv[which(duplicated(dv$wcid) | duplicated(dv$wcid, fromLast = TRUE)), ]
# 13 individuals who came back

repeatOffenders <- unique(duplicatedObs$wcid)

# Relabel the endpoints of every edge that touches a returning member.
#
# edges: edge list with columns sender, receiver and eventDateClean
#        ("YYYYMMDD" strings).
# stays: one row per stay with columns raw_id (original wcid), new_id
#        (per-stay id), enter and exit ("YYYYMMDD" strings).
#
# Returns the rows of `edges` that involve at least one raw id, with each such
# endpoint replaced by the id of the stay whose [enter, exit] window contains
# the event date. Rows where a returning member's endpoint falls outside all
# of that member's stays are dropped, since no per-stay id applies to them.
relabel_repeat_edges <- function(edges, stays) {
  raw_ids <- unique(stays$raw_id)
  touched <- edges[edges$sender %in% raw_ids | edges$receiver %in% raw_ids, , drop = FALSE]

  # Matching is always done against the untouched original columns; the new
  # labels are collected separately so one relabel cannot affect a later match.
  new_sender <- as.character(touched$sender)
  new_receiver <- as.character(touched$receiver)
  pending_sender <- touched$sender %in% raw_ids
  pending_receiver <- touched$receiver %in% raw_ids

  for (k in seq_len(nrow(stays))) {
    # "YYYYMMDD" strings compare in chronological order.
    in_stay <- touched$eventDateClean >= stays$enter[k] &
      touched$eventDateClean <= stays$exit[k]

    hit <- which(in_stay & touched$sender == stays$raw_id[k])
    new_sender[hit] <- stays$new_id[k]
    pending_sender[hit] <- FALSE

    hit <- which(in_stay & touched$receiver == stays$raw_id[k])
    new_receiver[hit] <- stays$new_id[k]
    pending_receiver[hit] <- FALSE
  }

  touched$sender <- new_sender
  touched$receiver <- new_receiver
  touched[!(pending_sender | pending_receiver), , drop = FALSE]
}

dv_repeat <- data.frame()
stays <- data.frame()

# seq_along() keeps the loop from running when nobody came back.
for (i in seq_along(repeatOffenders)) {
  id <- repeatOffenders[i]
  records <- duplicatedObs[which(duplicatedObs$wcid == id), ]
  if (nrow(records) == 0L) {
    next # e.g. a missing wcid, which cannot be matched
  }

  # Sort this member's stays from first to last entry, then label them.
  # The number of labels follows the number of stays instead of assuming two.
  records <- records[order(records$enterClean), ]
  newIds <- paste0(id, ".", seq_len(nrow(records)))

  stays <- rbind(
    stays,
    data.frame(
      raw_id = id,
      new_id = newIds,
      enter = records$enterClean,
      exit = records$exitClean,
      stringsAsFactors = FALSE
    )
  )

  records$wcid <- newIds
  dv_repeat <- rbind(dv_repeat, records)
}

# Apply the same labelling to senders and receivers of both edge lists, based
# upon timing. These tables accumulate the relabelled rows of ALL returning
# members and contain no rows that still carry a raw (un-split) id.
# NOTE(review): endpoints are matched against dv$wcid (not dv$wcid_new);
# presumably the edge lists use the same id format -- confirm.
sedge_repeat <- relabel_repeat_edges(sedge, stays)
ledge_repeat <- relabel_repeat_edges(ledge, stays)

# Now, we must go through and update the edge lists: remove every event that
# involves a returning member (as sender or receiver) and supplement with the
# relabelled rows in sedge_repeat and ledge_repeat.
#
# Rows are dropped with a logical mask rather than a negative index, because
# x[-integer(0), ] selects NO rows: with nothing to remove, the negative-index
# form would silently empty the whole table.
sedge_drop <- sedge$sender %in% repeatOffenders | sedge$receiver %in% repeatOffenders
ledge_drop <- ledge$sender %in% repeatOffenders | ledge$receiver %in% repeatOffenders
sedge_remove <- which(sedge_drop)
ledge_remove <- which(ledge_drop)

sedge <- sedge[!sedge_drop, ]
ledge <- ledge[!ledge_drop, ]

sedge <- rbind(sedge, sedge_repeat)
ledge <- rbind(ledge, ledge_repeat)

# Do the same with dv: order by entry date, drop the original rows of the
# returning members and append their relabelled per-stay rows.
dv <- dv[order(dv$enterClean), ]
dv_drop <- dv$wcid %in% repeatOffenders
dv_remove <- which(dv_drop)
dv <- dv[!dv_drop, ]
dv <- rbind(dv, dv_repeat)

# Drop events whose sender is 0 from both edge lists.
sedge <- sedge[sedge$sender != 0, ]
ledge <- ledge[ledge$sender != 0, ]

# Now, let's produce Source/Target edge lists of affirmations (ledge) and
# corrections (sedge), one row per recorded event.
affirm_el <- data.frame(
  Source = ledge$sender,
  Target = ledge$receiver
)

correct_el <- data.frame(
  Source = sedge$sender,
  Target = sedge$receiver
)

# Turn these into weighted edge lists: one row per distinct Source/Target
# pair, with the number of event rows for that pair as its Weight.
library(plyr)
affirm_el <- ddply(affirm_el, c("Source", "Target"), nrow)
names(affirm_el)[3] <- "Weight"

correct_el <- ddply(correct_el, c("Source", "Target"), nrow)
names(correct_el)[3] <- "Weight"

# Turn the weighted edge lists into dense adjacency matrices whose cells hold
# the "Weight" edge attribute, each wrapped in a one-element list named t1.
library(igraph)

# Affirmations
g <- graph.data.frame(affirm_el)
affirm_adj <- list(t1 = get.adjacency(g, sparse = FALSE, attr = "Weight"))

# Corrections (g is reused and ends up holding the corrections graph)
g <- graph.data.frame(correct_el)
correct_adj <- list(t1 = get.adjacency(g, sparse = FALSE, attr = "Weight"))

# Member-level attributes: each is a vector taken from dv, named by wcid, and
# wrapped in a one-element list named t1.
success_list <- list(t1 = setNames(dv$success, dv$wcid))

recidivism_list <- list(t1 = setNames(dv$recidivist, dv$wcid))

age_list <- list(t1 = setNames(dv$age, dv$wcid))

black_list <- list(t1 = setNames(dv$black, dv$wcid))

lsi_list <- list(t1 = setNames(dv$lsi, dv$wcid))

# Change in LSI: exit value minus the lsi column.
lsi_diff_list <- list(t1 = setNames(dv$lsiExit - dv$lsi, dv$wcid))

lsiExit_list <- list(t1 = setNames(dv$lsiExit, dv$wcid))

# Calculate reciprocity of affirmations by member.
#
# reciprocity_rate[i]:
#   percentage of the members who sent member i at least one affirmation to
#   whom i also sent at least one. NaN when i received from nobody.
# reciprocity_outdegree_rate[i]:
#   percentage of the members i sent at least one affirmation to who also
#   sent i at least one. NaN when i sent to nobody.
#
# Both are binary (tie / no tie) measures; edge weights are not used.
# NOTE(review): the earlier step-by-step version of this computation divided
# the positions returned by which(), so every ratio was exactly 1 or 0 and its
# "cap at 1" never fired. This direct form yields the same numbers. If a
# weight-based ratio was intended, the cell weights would have to be used.
ids <- rownames(affirm_adj$t1)
affirm_tie <- affirm_adj$t1 != 0
# seq_len() keeps an empty matrix from producing a bogus 1:0 iteration.
affirm_members <- seq_len(nrow(affirm_adj$t1))

reciprocity_rate <- vapply(affirm_members, function(i) {
  sent_to <- which(affirm_tie[i, ])
  received_from <- which(affirm_tie[, i])
  # share of in-partners that are also out-partners
  100 * mean(as.numeric(received_from %in% sent_to))
}, numeric(1))
names(reciprocity_rate) <- ids
reciprocity_rate_list <- list(t1 = reciprocity_rate)

reciprocity_outdegree_rate <- vapply(affirm_members, function(i) {
  sent_to <- which(affirm_tie[i, ])
  received_from <- which(affirm_tie[, i])
  # share of out-partners that are also in-partners
  100 * mean(as.numeric(sent_to %in% received_from))
}, numeric(1))
names(reciprocity_outdegree_rate) <- ids
reciprocity_outdegree_rate_list <- list(t1 = reciprocity_outdegree_rate)


# Calculate time in community: days between each record's entry and exit date
# (both stored as "YYYYMMDD" strings in dv; values that do not parse give NA).
library(lubridate)
finish <- ymd(dv$exitClean)
start <- ymd(dv$enterClean)
elapsed <- difftime(finish, start, units = "days")

# elapsed has one value per row of dv, so it is named by dv$wcid like the
# other attribute lists. (`ids` holds the row names of the affirmation
# adjacency matrix, which differ from dv in both length and order.)
names(elapsed) <- as.character(dv$wcid)
days_list <- list(t1 = elapsed)

# Calculate reciprocity of Corrections by member.
#
# reciprocity_rate_correct[i]:
#   percentage of the members who sent member i at least one correction to
#   whom i also sent at least one. NaN when i received from nobody.
# reciprocity_outdegree_rate_correct[i]:
#   percentage of the members i sent at least one correction to who also sent
#   i at least one. NaN when i sent to nobody.
#
# Both are binary (tie / no tie) measures; edge weights are not used.
# NOTE(review): the earlier step-by-step version of this computation divided
# the positions returned by which(), so every ratio was exactly 1 or 0 and its
# "cap at 1" never fired. This direct form yields the same numbers. If a
# weight-based ratio was intended, the cell weights would have to be used.
ids <- rownames(correct_adj$t1)
correct_tie <- correct_adj$t1 != 0
# seq_len() keeps an empty matrix from producing a bogus 1:0 iteration.
correct_members <- seq_len(nrow(correct_adj$t1))

reciprocity_rate_correct <- vapply(correct_members, function(i) {
  sent_to <- which(correct_tie[i, ])
  received_from <- which(correct_tie[, i])
  # share of in-partners that are also out-partners
  100 * mean(as.numeric(received_from %in% sent_to))
}, numeric(1))

# Left unnamed and unwrapped here; nothing in this section names it.
reciprocity_outdegree_rate_correct <- vapply(correct_members, function(i) {
  sent_to <- which(correct_tie[i, ])
  received_from <- which(correct_tie[, i])
  # share of out-partners that are also in-partners
  100 * mean(as.numeric(sent_to %in% received_from))
}, numeric(1))

names(reciprocity_rate_correct) <- ids
# Wrap the corrections rate itself (not the affirmation rate).
reciprocity_rate_correct_list <- list(t1 = reciprocity_rate_correct)
